# Here is another example of looking at two variables,
# x and y, where we think that there might be a linear
# relation between the variables.
# In this case we are given the values for x and y
#    x = 31.9 15.6 18.2 17.3 28.2 20.6 29.5 11.8 25.1 29.2
#    y = 28.5 13.6 18.0 15.7 28.4 22.6 21.6 11.6 19.2 21.5
# We need to put these into our environment
x <- c(31.9,15.6,18.2,17.3,28.2,20.6,29.5,11.8,25.1,29.2)
y <- c(28.5,13.6,18.0,15.7,28.4,22.6,21.6,11.6,19.2,21.5)
# then, a simple plot suggests that there is a linear
# relation between these two sets of values
plot( x, y )
# we can spruce up that plot by making some adjustments
plot(x, y,
     xlim=c(0,40),   # sets our x range from 0 to 40
     ylim=c(0,30),   # sets our y range from 0 to 30
     xaxp=c(0,40,8), # sets our x tick marks to
                     # 0 to 40 with 8 intervals
     yaxp=c(0,30,6), # sets our y tick marks to
                     # 0 to 30 with 6 intervals
     col="black",    # selects BLACK as the color of
                     # our points
     pch=16)         # makes our points appear as
                     # large black dots
# give the command ?points to see all the
# choices for the dots
# then, we can provide some background for our graph.
# we will put horizontal and vertical dashed
# dark blue lines even with the tick marks
abline( h=seq(0,30,5), v=seq(0,40,5),
        lty="dashed", col="darkblue")
# and we can make the axes stand out with black
# lines that are a little wider than normal
abline(h=0, v=0, col="black", lwd=2)
# as a small aid to reading our plot,
# let us draw light vertical lines
# at each of our x-values
abline( v=x, col="orange")
# just to review the values that we have
x
y
# It will be a bit better if we can see these
# pairs of values in a vertical setting,
# so put these in a dataframe
our_view <- data.frame(x,y)
our_view
# Looking at the points in the graph, I think
# that a line going through (0,10) and (40,30)
# might be a good representation of the relation
# between x and y.
# We can draw such a line on the graph if we know
# the intercept and the slope.
# The intercept is at (0,10) and the slope is
# 20/40 or 1/2
abline( 10, 1/2, col="blue", lwd=2)
# We have specific x values. We want to find the
# expected y-value for each of those known
# x-values.
# To do this we just use the equation
#    y = 10 + (1/2)*x
# with each of the x values that we have
expect_1 <- 10+(1/2)*x
expect_1
# we can append those to our dataframe
our_view$expect_1 <- expect_1
our_view
# and we can add those expected values to our plot
points( x, expect_1, col="blue", pch=0)
# Now we want to find how far off the observed
# values are from their corresponding expected
# values, i.e., find the observed - expected
our_view$diff_1 <- y - expect_1
our_view
# now find the square of the observed - expected
our_view$sqr_diff_1 <- our_view$diff_1^2
our_view
# then add up all of those squared differences
# we could use sum() but we will use cumsum()
# to get this
our_view$total_1 <- cumsum( our_view$sqr_diff_1)
our_view
# so our sum of the squared differences between the
# observed y-values and the expected y-values
# is 107.08
# That was interesting, but can we find a line
# that is a better representation of the linear
# relation between x and y? We will know it is
# better if it produces a lower sum of the squared
# differences between the observed and the
# expected values.
# let us try another line...
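# (An optional aside, not part of the original walk-through: the
# steps above -- expected values, differences, squares, sum -- can
# be wrapped in a small helper function so that any candidate line
# can be scored quickly. This is only a sketch; the function name
# sse_for_line is made up here, and it uses the x and y values
# already in our environment.)
sse_for_line <- function( intercept, slope ) {
  expected <- intercept + slope*x    # expected y for each x
  sum( (y - expected)^2 )            # sum of squared differences
}
sse_for_line( 10, 1/2 )   # should reproduce the 107.08 found above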
# how about the line through (0,0) and (35,30)?
# that gives the intercept at (0,0), and the slope is
# 30/35 or 6/7
abline( 0, 6/7, col="red", lwd=2)
# On that line the expected values,
# using y = 0 + (6/7)*x, are
expect_2 <- 0+(6/7)*x
expect_2
# let us create a second dataframe for this
our_view_2 <- data.frame( x, y, expect_2)
our_view_2
# add those expected values to our plot
points( x, expect_2, col="darkred", bg="red", pch=23)
# then find the observed - expected
our_view_2$diff_2 <- y - expect_2
our_view_2
# now find the square of the observed - expected
our_view_2$sqr_diff_2 <- our_view_2$diff_2^2
our_view_2
# then add up all of those squared differences
# we could use sum() but we will use cumsum()
# to get this
our_view_2$total_2 <- cumsum( our_view_2$sqr_diff_2)
our_view_2
# so our sum of the squared differences between the
# observed y-values and the expected y-values
# is 83.821837
# Therefore, our second line, the red one, has
# a lower sum of the squared differences
# between the observed values and the expected
# values than the sum of the squared differences
# for the first line, the blue one.
# That makes the red line a better fit
# for our data.
################
# we could try other lines that we might think up,
# but the question is, what is the
# equation of the best line?
# to find this we use the lm() command to get our
# linear model.
lm_hold <- lm( y ~ x )
lm_hold
# this tells us that the best fitting line has
# an intercept of 3.924 and a slope of 0.710.
# That is, the best line is
#    y = 3.924 + 0.710*x
# because we saved our linear model we can draw
# that line on our graph
abline( lm_hold, col="green", lwd=2)
# There is actually a lot more to our model than
# just the intercept and the slope. We can see some
# of this by using the summary() function.
summary( lm_hold )
# In that output we find both the intercept
# and the slope, but with more decimal places.
# We can even pull the intercept and the slope
# out of the model to see even more decimal
# places. We do this with the coefficients()
# function.
coeffs <- coefficients( lm_hold )
coeffs
# In particular, then,
# coeffs[1] holds the intercept
# and coeffs[2] holds the slope.
# Let us go through the same process that we
# did above to find the sum of the squared
# differences between observed and expected
# values using these new intercept and slope
# values.
our_view_3 <- data.frame( x, y )
# get our new expected values
our_view_3$expect_3 <- coeffs[1] + coeffs[2]*x
our_view_3
# then the observed - expected values
our_view_3$diff_3 <- y - our_view_3$expect_3
our_view_3
# then find the squared differences
our_view_3$sqrd_diff <- our_view_3$diff_3^2
our_view_3
# Then get the sum of the squared differences
our_view_3$cum_sum <- cumsum( our_view_3$sqrd_diff)
our_view_3
# that sum, 71.119232, is lower than was the
# sum for our red line. Not only is this a
# better fit line, this is the best fit line!
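# (Another optional aside, just as a check on the values that lm()
# reported: the least squares intercept and slope can also be
# computed directly from the data with the usual formulas
#    slope     = sum( (x - mean(x))*(y - mean(y)) ) / sum( (x - mean(x))^2 )
#    intercept = mean(y) - slope*mean(x)
# The names ls_slope and ls_intercept below are made up for this
# sketch and are not used elsewhere in the walk-through.)
ls_slope <- sum( (x - mean(x))*(y - mean(y)) ) / sum( (x - mean(x))^2 )
ls_intercept <- mean(y) - ls_slope*mean(x)
ls_slope       # should agree with coeffs[2]
ls_intercept   # should agree with coeffs[1]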
# Before we leave this look at our linear
# regression, there is one more topic. Above we
# saw that we could find the difference between
# the observed values, our y-values, and the
# expected values, those that we put into
# our_view_3$expect_3. Above we stored those
# differences in our_view_3$diff_3.
# we have a special name for such differences.
# They are called the residuals. We can
# calculate the residuals, as we did above.
# For example, in our data the sixth pair of
# x,y values is (20.6, 22.6). Thus, when
# x is 20.6 the observed y value is 22.6.
# When x is 20.6, the expected value is
coeffs[1] + coeffs[2] * 20.6
# which means that the residual value when x=20.6
# is
22.6 - (coeffs[1] + coeffs[2] * 20.6)
# of course, we calculated that before, as shown
# in the sixth line of our_view_3
# However, since we stored our linear model
# in the variable lm_hold we can use the function
# residuals() to pull all of the residual values
# out of the model.
residuals( lm_hold )
# It is helpful to get the residual values because
# we want to look at a plot of the x-values and
# the residual values. We want to see that the
# points in that plot are all over the place, not
# forming some sort of pattern. With only 10 points
# in our data it is hard to see that these are
# spread out and not in a pattern.
plot( x, residuals( lm_hold) )
######################
# Now, we have seen that we can find the best
# fit regression line. What we want now is some
# measure of just how good a fit that is.
# we do that with the correlation coefficient.
cor( x, y )
# The correlation coefficient is always a value
# between -1 and 1. The square of the
# correlation coefficient tells us the percent
# of the variability we see in our data that is
# "explained" by the linear regression model.
# in our example this is
0.8682721^2
# so our linear model, y = 3.924 + 0.710*x,
# explains just over 75% of the variation seen
# in our data.
#
# To be sure, we actually generated
# the x and y values above using
source("../gnrnd4.R")
gnrnd4( key1=1345920906, key2=6120650913,
        key3=23000100 )
# Let us generate a larger set of data and then
# quickly run through doing a plot, getting the
# coefficients of the regression equation,
# getting and plotting the residuals, and then
# finding the correlation coefficient.
gnrnd4( key1=1345922306, key2=4185651309,
        key3=23000100 )
plot( L1, L2 )
# clean up that plot
plot( L1, L2,
      xlim=c(0,40),    # sets our x range from 0 to 40
      ylim=c(0,60),    # sets our y range from 0 to 60
      xaxp=c(0,40,8),  # sets our x tick marks to
                       # 0 to 40 with 8 intervals
      yaxp=c(0,60,12), # sets our y tick marks to
                       # 0 to 60 with 12 intervals
      col="black",     # selects BLACK as the color of
                       # our points
      pch=16,          # makes our points appear as
                       # large black dots
      las=1,           # makes our axis values appear
                       # in a horizontal alignment
      cex.axis=0.5     # scales the axis values to 1/2
                       # their usual size
)
# we will put horizontal and vertical dashed
# dark blue lines even with the tick marks
abline( h=seq(0,60,5), v=seq(0,40,5),
        lty="dashed", col="darkblue")
# and we can make the axes stand out with black
# lines that are a little wider than normal
abline(h=0, v=0, col="black", lwd=2)
my_line <- lm( L2 ~ L1 )
my_line
# so our regression equation is
#    y = 63.986 - 1.521*x
abline( my_line )
plot( L1, residuals( my_line ))
cor( L1, L2 )
cor( L1, L2 )^2
# so our model explains about 93.6% of the
# variability of the data
# you might note that had we done
summary( my_line )
# that 0.9365 appears as the
#    Multiple R-squared:  0.9365
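# (A final optional aside, shown only as a sketch: the R-squared
# value reported by summary() can be pulled out of the summary
# object directly and compared to the squared correlation.)
summary( my_line )$r.squared
# this should match cor( L1, L2 )^2, about 0.9365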